This notebook is made to analyze Walmart, a big retail store. We'll look at data to find important patterns and trends that can help understand how Walmart is doing in its business.¶
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
df = pd.read_csv('walmart-sales-dataset-of-45stores.csv')
df[['Weekly_Sales','Holiday_Flag','Temperature','Fuel_Price','Unemployment']].describe()
Out[3]:
| Weekly_Sales | Holiday_Flag | Fuel_Price | Unemployment | |
|---|---|---|---|---|
| count | 6.435000e+03 | 6435.000000 | 6435.000000 | 6435.000000 |
| mean | 1.046965e+06 | 0.069930 | 3.358607 | 7.999151 |
| std | 5.643666e+05 | 0.255049 | 0.459020 | 1.875885 |
| min | 2.099862e+05 | 0.000000 | 2.472000 | 3.879000 |
| 25% | 5.533501e+05 | 0.000000 | 2.933000 | 6.891000 |
| 50% | 9.607460e+05 | 0.000000 | 3.445000 | 7.874000 |
| 75% | 1.420159e+06 | 0.000000 | 3.735000 | 8.622000 |
| max | 3.818686e+06 | 1.000000 | 4.468000 | 14.313000 |
In [4]:
selecte_col=df[['Weekly_Sales','Holiday_Flag','Temperature','Fuel_Price','Unemployment']]
for col in selecte_col:
sns.displot(df[col], kde=True)
plt.tight_layout()
plt.show()
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\hp\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
In [5]:
print(df.head(10))
df.info()
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price \
0 1 5/2/2010 1643690.90 0 42.31 2.572
1 1 12/2/2010 1641957.44 1 38.51 2.548
2 1 19/02/2010 1611968.17 0 39.93 2.514
3 1 26/02/2010 1409727.59 0 46.63 2.561
4 1 5/3/2010 1554806.68 0 46.5 2.625
5 1 12/3/2010 1439541.59 0 57.79 2.667
6 1 19/03/2010 1472515.79 0 54.58 2.720
7 1 26/03/2010 1404429.92 0 51.45 2.732
8 1 2/4/2010 1594968.28 0 62.27 2.719
9 1 9/4/2010 1545418.53 0 65.86 2.770
CPI Unemployment Unnamed: 8
0 211.0964 8.106 NaN
1 211.2422 8.106 NaN
2 211.2891 8.106 NaN
3 211.3196 8.106 NaN
4 211.3501 8.106 NaN
5 211.3806 8.106 NaN
6 211.2156 8.106 NaN
7 211.0180 8.106 NaN
8 210.8204 7.808 NaN
9 210.6229 7.808 NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Store 6435 non-null int64
1 Date 6435 non-null object
2 Weekly_Sales 6435 non-null float64
3 Holiday_Flag 6435 non-null int64
4 Temperature 6435 non-null object
5 Fuel_Price 6435 non-null float64
6 CPI 6435 non-null float64
7 Unemployment 6435 non-null float64
8 Unnamed: 8 0 non-null float64
dtypes: float64(5), int64(2), object(2)
memory usage: 452.6+ KB
In [8]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.isnull().sum()
Out[8]:
Store 0 Date 3870 Weekly_Sales 0 Holiday_Flag 0 Temperature 0 Fuel_Price 0 CPI 0 Unemployment 0 Unnamed: 8 6435 dtype: int64
In [9]:
df.duplicated().sum()
Out[9]:
0
In [10]:
#i will use iqr for this task
selected_columns = ['Unemployment', 'CPI', 'Fuel_Price', 'Weekly_Sales', 'Temperature']
for col in selected_columns:
plt.figure(figsize=(8, 8))
sns.boxplot(x=df[col])
plt.title(f'Box Plot of {col}')
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[10], line 6 4 for col in selected_columns: 5 plt.figure(figsize=(8, 8)) ----> 6 sns.boxplot(x=df[col]) 7 plt.title(f'Box Plot of {col}') 8 plt.show() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2231, in boxplot(data, x, y, hue, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, ax, **kwargs) 2224 def boxplot( 2225 data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, 2226 orient=None, color=None, palette=None, saturation=.75, width=.8, 2227 dodge=True, fliersize=5, linewidth=None, whis=1.5, ax=None, 2228 **kwargs 2229 ): -> 2231 plotter = _BoxPlotter(x, y, hue, data, order, hue_order, 2232 orient, color, palette, saturation, 2233 width, dodge, fliersize, linewidth) 2235 if ax is None: 2236 ax = plt.gca() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:785, in _BoxPlotter.__init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth) 781 def __init__(self, x, y, hue, data, order, hue_order, 782 orient, color, palette, saturation, 783 width, dodge, fliersize, linewidth): --> 785 self.establish_variables(x, y, hue, data, orient, order, hue_order) 786 self.establish_colors(color, palette, saturation) 788 self.dodge = dodge File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:544, in _CategoricalPlotter.establish_variables(self, x, y, hue, data, orient, order, hue_order, units) 541 raise ValueError(err) 543 # Figure out the plotting orientation --> 544 orient = infer_orient( 545 x, y, orient, require_numeric=self.require_numeric 546 ) 548 # Option 2a: 549 # We are plotting a single set of data 550 # ------------------------------------ 551 if x is None or y is None: 552 553 # Determine where the data are File ~\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1601, in infer_orient(x, y, orient, require_numeric) 1599 warnings.warn(single_var_warning.format("Vertical", "x")) 1600 if require_numeric and x_type != "numeric": -> 1601 raise TypeError(nonnumeric_dv_error.format("Horizontal", "x")) 1602 return "h" 1604 elif str(orient).startswith("v"): TypeError: Horizontal orientation requires numeric `x` variable.
<Figure size 800x800 with 0 Axes>
i use IQR for outlines¶
In [ ]:
selected_columns = ['Unemployment', 'Weekly_Sales', 'Temperature']
for col in selected_columns:
q1=df[col].quantile(0.25)
q3=df[col].quantile(0.75)
iqr=q3-q1
width=1.5
lower=q1-(width*iqr)
higher=q3+(width*iqr)
df[col]=np.where(df[col]>higher,higher,np.where(df[col]<lower,lower,df[col]))
In [ ]:
selected_columns = ['Unemployment', 'Weekly_Sales', 'Temperature']
for col in selected_columns:
plt.figure(figsize=(8, 8))
sns.boxplot(x=df[col])
plt.title(f'Box Plot of {col}')
plt.show()
In [17]:
"""
bins = [0, 3, 6, 9, 12]
labels = ['Semester 1', 'Semester 2', 'Semester 3', 'Semester 4']
df['Semester'] = pd.cut(df['Date'].dt.month, bins=bins, labels=labels)
"""
semester_map = {
1: 'Winter', 2: 'Winter', 3: 'Spring',
4: 'Spring', 5: 'Spring', 6: 'Summer',
7: 'Summer', 8: 'Summer', 9: 'Fall',
10: 'Fall', 11: 'Fall', 12: 'Winter'
}
df['Semester'] = df['Date'].dt.month.map(semester_map)
print(df)
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price \
0 1 2010-05-02 1643690.90 0 42.31 2.572
1 1 2010-12-02 1641957.44 1 38.51 2.548
2 1 NaT 1611968.17 0 39.93 2.514
3 1 NaT 1409727.59 0 46.63 2.561
4 1 2010-05-03 1554806.68 0 46.5 2.625
... ... ... ... ... ... ...
6430 45 NaT 713173.95 0 64.88 3.997
6431 45 2012-05-10 733455.07 0 64.89 3.985
6432 45 2012-12-10 734464.36 0 54.47 4.000
6433 45 NaT 718125.53 0 56.47 3.969
6434 45 NaT 760281.43 0 58.85 3.882
CPI Unemployment Unnamed: 8 Semester
0 211.0964 8.106 NaN Spring
1 211.2422 8.106 NaN Winter
2 211.2891 8.106 NaN NaN
3 211.3196 8.106 NaN NaN
4 211.3501 8.106 NaN Spring
... ... ... ... ...
6430 192.0136 8.684 NaN NaN
6431 192.1704 8.667 NaN Spring
6432 192.3273 8.667 NaN Winter
6433 192.3309 8.667 NaN NaN
6434 192.3089 8.667 NaN NaN
[6435 rows x 10 columns]
In [11]:
sales_sum = df.groupby('Store')['Weekly_Sales'].sum() / 1000000
plt.figure(figsize=(10, 6))
sales_sum.plot(kind='bar')
plt.ylabel('sum of Weekly Sales')
plt.title('Sum of Weekly Sales per Store')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
this mean 20 store have the max sales with 2.991m¶
In [12]:
print(sales_sum.head(45))
max_store = sales_sum.idxmax()
max_value = sales_sum[max_store]
print("Store", max_store, "has the maximum sales of", max_value, "M")
Store 1 222.402809 2 275.382441 3 57.586735 4 299.543953 5 45.475689 6 223.756131 7 81.598275 8 129.951181 9 77.789219 10 271.617714 11 193.962787 12 144.287230 13 286.517704 14 288.999911 15 89.133684 16 74.252425 17 127.782139 18 155.114734 19 206.634862 20 301.397792 21 108.117879 22 147.075649 23 198.750618 24 194.016021 25 101.061179 26 143.416394 27 253.855917 28 189.263681 29 77.141554 30 62.716885 31 199.613905 32 166.819246 33 37.160222 34 138.249763 35 131.520672 36 53.412215 37 74.202740 38 55.159626 39 207.445542 40 137.870310 41 181.341935 42 79.565752 43 90.565435 44 43.293088 45 112.395341 Name: Weekly_Sales, dtype: float64 Store 20 has the maximum sales of 301.39779246 M
b) Which store has maximum standard deviation i.e., the sales vary a lot¶
In [13]:
sales_std = df.groupby('Store')['Weekly_Sales'].std()
print(sales_std.head(45))
max_storestd = sales_std.idxmax()
max_valuestd = sales_std[max_storestd]
print("Store", max_storestd, "standard deviation of", max_valuestd.round(2))
Store 1 155980.767761 2 237683.694682 3 46319.631557 4 266201.442297 5 37737.965745 6 212525.855862 7 112585.469220 8 106280.829881 9 69028.666585 10 302262.062504 11 165833.887863 12 139166.871880 13 265506.995776 14 317569.949476 15 120538.652043 16 85769.680133 17 112162.936087 18 176641.510839 19 191722.638730 20 275900.562742 21 128752.812853 22 161251.350631 23 249788.038068 24 167745.677567 25 112976.788600 26 110431.288141 27 239930.135688 28 181758.967539 29 99120.136596 30 22809.665590 31 125855.942933 32 138017.252087 33 24132.927322 34 104630.164676 35 211243.457791 36 60725.173579 37 21837.461190 38 42768.169450 39 217466.454833 40 119002.112858 41 187907.162766 42 50262.925530 43 40598.413260 44 24762.832015 45 130168.526635 Name: Weekly_Sales, dtype: float64 Store 14 standard deviation of 317569.95
In [14]:
plt.figure(figsize=(10, 6))
sales_std.plot(kind='bar')
plt.ylabel('Standard Deviation')
plt.title('Standard Deviation of Weekly Sales per Store')
plt.tight_layout()
plt.show()
c) Some holidays have a negative impact on sales. Find out holidays that have higher sales than the mean sales in the non-holiday season for all stores together.¶
In [15]:
holiday = df.groupby('Holiday_Flag')['Weekly_Sales'].mean()
print("Mean of the non holiday sales : ",holiday[0].round(2))
high_sales = df[df['Holiday_Flag'] ==1][df[df['Holiday_Flag'] ==1]['Weekly_Sales'] >holiday[0]]
num = len(high_sales)
print("Number of rows in the DataFrame:", num)
print(high_sales['Date'].unique())
Mean of the non holiday sales : 1041256.38 Number of rows in the DataFrame: 220 <DatetimeArray> ['2010-12-02 00:00:00', '2010-10-09 00:00:00', 'NaT', '2011-11-02 00:00:00', '2011-09-09 00:00:00', '2012-10-02 00:00:00', '2012-07-09 00:00:00'] Length: 7, dtype: datetime64[ns]
d) Provide a monthly and semester view of sales in units and give insights.¶
In [18]:
# monthly_sales = (df.groupby([df['Date'].dt.year, df['Date'].dt.month])['Weekly_Sales'].sum()).round(2)
monthly_sales = (df.groupby(df['Date'].dt.month)['Weekly_Sales'].sum()).round(2)
semesterly_sales = (df.groupby('Semester')['Weekly_Sales'].sum()).round(2)
#for (year, month), sales in monthly_sales.items():
# print(f"{year}-{month:02}: ${sales:,.0f}")
for month_index, sales in enumerate(monthly_sales, start=1):
print(f"Month {month_index}: ${sales:,.0f}")
for semester, sales in semesterly_sales.items():
print(f"Semester {semester}: ${sales:,.0f}")
plt.figure(figsize=(8, 6))
monthly_sales.plot(kind='bar', subplots=True)
plt.figure(figsize=(8, 6))
semesterly_sales.plot(kind='bar', subplots=True)
Month 1: $181,559,036 Month 2: $240,969,530 Month 3: $239,446,787 Month 4: $239,102,000 Month 5: $237,986,299 Month 6: $244,777,717 Month 7: $186,820,779 Month 8: $188,500,877 Month 9: $245,069,649 Month 10: $246,383,762 Month 11: $235,190,163 Month 12: $232,764,958 Semester Fall: $726,643,574 Semester Spring: $716,535,086 Semester Summer: $620,099,373 Semester Winter: $655,293,525
Out[18]:
array([<Axes: title={'center': 'Weekly_Sales'}, xlabel='Semester'>],
dtype=object)
In [19]:
sales = df.groupby("Date")[["Weekly_Sales"]].sum()
sales.plot(color='red', figsize=(14, 6))
plt.title('Weekly Sales with date ')
plt.figure(figsize=(14, 6))
sns.scatterplot(data=df, x='CPI', y='Weekly_Sales')
plt.title('Weekly Sales with CPI')
plt.figure(figsize=(14, 6))
sns.violinplot(data=df, x='Holiday_Flag', y='Weekly_Sales')
plt.title('Weekly Sales with Holiday Flag ')
plt.figure(figsize = (14,6))
plt.scatter( df['Temperature'] ,df['Weekly_Sales'] )
plt.title("Temperature with weekly_Sales")
sales_store = df.groupby("Store")[["Weekly_Sales"]].sum()
sales_store.plot(color='blue', figsize=(14, 6))
plt.title('Weekly Sales with date ')
Out[19]:
Text(0.5, 1.0, 'Weekly Sales with date ')
In [20]:
selecte_col.plot(subplots=True, grid=True, figsize=(10,15))
Out[20]:
array([<Axes: >, <Axes: >, <Axes: >, <Axes: >], dtype=object)
In [21]:
plt.figure(figsize = (14,6))
sns.set(style='dark')
plt.scatter( df['Temperature'] ,df['Weekly_Sales'] , c = df['Store'])
plt.colorbar()
plt.ylabel("Weekly_Sales")
plt.title("Temperature with weekly_Sales")
Out[21]:
Text(0.5, 1.0, 'Temperature with weekly_Sales')
Correlation between Columns¶
In [23]:
selecte_col = selecte_col.apply(pd.to_numeric, errors='coerce') # Convert non-numeric to NaN
selecte_col = selecte_col.dropna() # Drop rows with NaN values
# Compute correlation matrix
correlation_matrix = selecte_col.corr()
# Plot heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="Blues")
plt.title('Correlation Heatmap')
plt.show()